library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(ggpubr)
library(FinancialMath)
library(ggsignif)
Neighborhood - Zipcode Matching Data: We’ll need this to assign each property to a neighborhood
neighborhood_data <- read_csv("data/nyc_zip_borough_neighborhoods_pop.csv")
## Rows: 177 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): borough, post_office, neighborhood
## dbl (3): zip, population, density
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
neighborhood_data
## # A tibble: 177 × 6
## zip borough post_office neighborhood population density
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 10001 Manhattan New York, NY Chelsea and Clinton 21102 33959
## 2 10002 Manhattan New York, NY Lower East Side 81410 92573
## 3 10003 Manhattan New York, NY Lower East Side 56024 97188
## 4 10004 Manhattan New York, NY Lower Manhattan 3089 5519
## 5 10005 Manhattan New York, NY Lower Manhattan 7135 97048
## 6 10006 Manhattan New York, NY Lower Manhattan 3011 32796
## 7 10007 Manhattan New York, NY Lower Manhattan 6988 42751
## 8 10009 Manhattan New York, NY Lower East Side 61347 99492
## 9 10010 Manhattan New York, NY Gramercy Park and Murray Hill 31834 81487
## 10 10011 Manhattan New York, NY Chelsea and Clinton 50984 77436
## # … with 167 more rows
Rental Data:
rental_data <- read_csv("data/manhattan-ny_rent_2024_03_15.csv")
## New names:
## Rows: 6278 Columns: 17
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): listing_type, unit_description, detailed_url, unit_address, unit_a... dbl
## (10): ...1, Unnamed: 0.1, Unnamed: 0, latitude, longitude, unit_zipcode,...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
rental_data %<>% left_join(neighborhood_data, by=c("unit_zipcode"="zip")) %>% filter(borough == "Manhattan") # attach neighborhood data
rental_data
## # A tibble: 6,274 × 22
## ...1 Unnam…¹ Unnam…² listi…³ unit_…⁴ detai…⁵ latit…⁶ longi…⁷ unit_…⁸ unit_…⁹
## <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 0 0 0 nested Riverb… /apart… 40.8 -74.0 560 W … 560 W …
## 2 1 1 1 nested The Ca… /apart… 40.7 -74.0 776 6t… 776 6t…
## 3 2 2 2 nested Waters… /apart… 40.7 -74.0 25 Wat… 25 Wat…
## 4 3 3 3 nested Pearl … /apart… 40.7 -74.0 160 Wa… 160 Wa…
## 5 4 4 4 nested Avalon… /apart… 40.7 -74.0 11 E 1… 11 E …
## 6 5 5 5 nested 21 Che… /apart… 40.7 -74.0 120 W … 120 W …
## 7 6 6 6 nested The Sa… /apart… 40.8 -74.0 189 W … 189 W …
## 8 7 7 7 nested 19 Dut… /apart… 40.7 -74.0 19 Dut… 19 Dut…
## 9 8 8 8 nested Anagra… /apart… 40.8 -74.0 1 W 60… 1 W 6…
## 10 9 9 9 nested The Ch… /apart… 40.7 -74.0 160 W … 160 W …
## # … with 6,264 more rows, 12 more variables: unit_city <chr>,
## # unit_zipcode <dbl>, beds <dbl>, baths <dbl>, area <dbl>, price <dbl>,
## # unit_number <chr>, borough <chr>, post_office <chr>, neighborhood <chr>,
## # population <dbl>, density <dbl>, and abbreviated variable names
## # ¹`Unnamed: 0.1`, ²`Unnamed: 0`, ³listing_type, ⁴unit_description,
## # ⁵detailed_url, ⁶latitude, ⁷longitude, ⁸unit_address, ⁹unit_address_street
Sales Data:
sales_data <- read_csv("data/manhattan-ny_sale_2024_03_15.csv")
## New names:
## • `` -> `...1`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 6992 Columns: 55
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (23): rawHomeStatusCd, marketingStatusSimplifiedCd, imgSrc, detailUrl, ...
## dbl (11): ...1, zpid, id, providerListingId, unformattedPrice, addressZipco...
## lgl (19): hasImage, isUndisclosedAddress, isZillowOwned, isSaved, isUserCla...
## dttm (2): openHouseStartDate, openHouseEndDate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#rename some variables
sales_data$price <- sales_data$unformattedPrice
sales_data$sqft <- sales_data$area
sales_data %<>% left_join(neighborhood_data, by=c("addressZipcode"="zip")) %>% filter(borough == "Manhattan")
sales_data
## # A tibble: 6,984 × 61
## ...1 zpid id rawHo…¹ marke…² provi…³ imgSrc hasIm…⁴ detai…⁵ statu…⁶
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <chr> <lgl> <chr> <chr>
## 1 43 3.35e8 3.35e8 ForSale For Sa… 1694903 https… TRUE https:… FOR_SA…
## 2 44 1.20e8 1.20e8 ForSale For Sa… 1681982 https… TRUE https:… FOR_SA…
## 3 45 2.06e9 2.06e9 ForSale For Sa… 1700320 https… TRUE https:… FOR_SA…
## 4 46 2.08e9 2.08e9 ForSale For Sa… 1698963 https… TRUE https:… FOR_SA…
## 5 47 2.45e8 2.45e8 ForSale For Sa… 1682103 https… TRUE https:… FOR_SA…
## 6 48 2.05e9 2.05e9 ForSale For Sa… 1690340 https… TRUE https:… FOR_SA…
## 7 49 2.06e9 2.06e9 ForSale For Sa… NA https… TRUE https:… FOR_SA…
## 8 50 2.09e9 2.09e9 ForSale For Sa… 1454319 https… TRUE https:… FOR_SA…
## 9 51 2.06e9 2.06e9 ForSale For Sa… 1672430 https… TRUE https:… FOR_SA…
## 10 52 2.06e9 2.06e9 ForSale For Sa… 1632998 https… TRUE https:… FOR_SA…
## # … with 6,974 more rows, 51 more variables: statusText <chr>,
## # countryCurrency <chr>, price <dbl>, unformattedPrice <dbl>, address <chr>,
## # addressStreet <chr>, addressCity <chr>, addressState <chr>,
## # addressZipcode <dbl>, isUndisclosedAddress <lgl>, beds <dbl>, baths <dbl>,
## # area <dbl>, latLong <chr>, isZillowOwned <lgl>, variableData <chr>,
## # hdpData <chr>, isSaved <lgl>, isUserClaimingOwner <lgl>,
## # isUserConfirmedClaim <lgl>, pgapt <chr>, sgapt <chr>, …
Annotated Sales Data: (this is data where which we have more than 2 historical sales) 1583/6992 = 0.22, so we have about 22% of the sales properties actively listed
sales_data_annotated <- read_csv("data/manhattan-ny_sale_2024_03_15_with_cagr_and_hoa_fees.csv")
## New names:
## Rows: 1583 Columns: 65
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (25): rawHomeStatusCd, marketingStatusSimplifiedCd, imgSrc, detailUrl, ... dbl
## (20): ...1, Unnamed: 0, zpid, id, providerListingId, unformattedPrice, ... lgl
## (18): hasImage, isUndisclosedAddress, isZillowOwned, isSaved, isUserCla... dttm
## (2): openHouseStartDate, openHouseEndDate
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
#rename some variables
sales_data_annotated$price <- sales_data_annotated$unformattedPrice
sales_data_annotated$sqft <- sales_data_annotated$area
sales_data_annotated %<>% left_join(neighborhood_data, by=c("addressZipcode"="zip")) %>% filter(borough == "Manhattan")
sales_data_annotated
## # A tibble: 1,583 × 71
## ...1 Unnamed:…¹ zpid id rawHo…² marke…³ provi…⁴ imgSrc hasIm…⁵ detai…⁶
## <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <chr> <lgl> <chr>
## 1 3 44 1.20e8 1.20e8 ForSale For Sa… 1681982 https… TRUE https:…
## 2 21 62 8.20e7 8.20e7 ForSale For Sa… 1682235 https… TRUE https:…
## 3 38 79 9.46e7 9.46e7 ForSale For Sa… 1672547 https… TRUE https:…
## 4 55 96 8.19e7 8.19e7 ForSale For Sa… 1701687 https… TRUE https:…
## 5 56 97 1.26e8 1.26e8 ForSale For Sa… 1701077 https… TRUE https:…
## 6 64 105 2.20e8 2.20e8 ForSale For Sa… 1696777 https… TRUE https:…
## 7 75 116 1.20e8 1.20e8 ForSale For Sa… NA https… TRUE https:…
## 8 84 125 7.25e7 7.25e7 ForSale For Sa… 1676462 https… TRUE https:…
## 9 91 132 7.25e7 7.25e7 ForSale For Sa… 1689159 https… TRUE https:…
## 10 92 133 7.25e7 7.25e7 ForSale For Sa… 1695364 https… TRUE https:…
## # … with 1,573 more rows, 61 more variables: statusType <chr>,
## # statusText <chr>, countryCurrency <chr>, price <dbl>,
## # unformattedPrice <dbl>, address <chr>, addressStreet <chr>,
## # addressCity <chr>, addressState <chr>, addressZipcode <dbl>,
## # isUndisclosedAddress <lgl>, beds <dbl>, baths <dbl>, area <dbl>,
## # latLong <chr>, isZillowOwned <lgl>, variableData <chr>, hdpData <chr>,
## # isSaved <lgl>, isUserClaimingOwner <lgl>, isUserConfirmedClaim <lgl>, …
Add financing information (assuming a 20% downpayment on a 30 year mortgage)
interest_rate = 0.0772
downpayment_percent = 0.2
air12 = interest_rate / 12 #annual interest rate/12
installments = 12 * 30# assuming a 30-year fixed
sales_data_annotated %<>% mutate(downpayment = unformattedPrice*downpayment_percent) %>% mutate(monthly_payment = (unformattedPrice-downpayment)*(air12*(1+air12)^installments)/((1+air12)^installments - 1))
sales_data_annotated %<>% mutate(monthly_payment_w_fees = monthly_payment + hoa_fee)
sales_data_annotated
## # A tibble: 1,583 × 74
## ...1 Unnamed:…¹ zpid id rawHo…² marke…³ provi…⁴ imgSrc hasIm…⁵ detai…⁶
## <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <chr> <lgl> <chr>
## 1 3 44 1.20e8 1.20e8 ForSale For Sa… 1681982 https… TRUE https:…
## 2 21 62 8.20e7 8.20e7 ForSale For Sa… 1682235 https… TRUE https:…
## 3 38 79 9.46e7 9.46e7 ForSale For Sa… 1672547 https… TRUE https:…
## 4 55 96 8.19e7 8.19e7 ForSale For Sa… 1701687 https… TRUE https:…
## 5 56 97 1.26e8 1.26e8 ForSale For Sa… 1701077 https… TRUE https:…
## 6 64 105 2.20e8 2.20e8 ForSale For Sa… 1696777 https… TRUE https:…
## 7 75 116 1.20e8 1.20e8 ForSale For Sa… NA https… TRUE https:…
## 8 84 125 7.25e7 7.25e7 ForSale For Sa… 1676462 https… TRUE https:…
## 9 91 132 7.25e7 7.25e7 ForSale For Sa… 1689159 https… TRUE https:…
## 10 92 133 7.25e7 7.25e7 ForSale For Sa… 1695364 https… TRUE https:…
## # … with 1,573 more rows, 64 more variables: statusType <chr>,
## # statusText <chr>, countryCurrency <chr>, price <dbl>,
## # unformattedPrice <dbl>, address <chr>, addressStreet <chr>,
## # addressCity <chr>, addressState <chr>, addressZipcode <dbl>,
## # isUndisclosedAddress <lgl>, beds <dbl>, baths <dbl>, area <dbl>,
## # latLong <chr>, isZillowOwned <lgl>, variableData <chr>, hdpData <chr>,
## # isSaved <lgl>, isUserClaimingOwner <lgl>, isUserConfirmedClaim <lgl>, …
Add CONDO vs CO-OP If you look up online, “D4-ELEVATOR APT”, “C6-WALK-UP APARTMENT” buildings are CO-OP buildings while R0-CONDOMINIUMS are CONDO buildings. There are some other weird types in rare cases but they are very few and we want to ignore them for this analysis.
sales_data_annotated %<>% filter(building_type %in% c("D4-ELEVATOR APT", "C6-WALK-UP APARTMENT", "R0-CONDOMINIUMS")) %>% mutate(building_type_simple = case_when(
building_type %in% c("D4-ELEVATOR APT", "C6-WALK-UP APARTMENT ") ~ "CO-OP",
building_type %in% c("R0-CONDOMINIUMS") ~ "CONDO"
))
sales_data_annotated %<>% filter(building_type_simple %in% c("CO-OP", "CONDO"))
sales_data_annotated # 1583 - 1368 = 215 (listings removed)
## # A tibble: 1,368 × 75
## ...1 Unnamed:…¹ zpid id rawHo…² marke…³ provi…⁴ imgSrc hasIm…⁵ detai…⁶
## <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <chr> <lgl> <chr>
## 1 3 44 1.20e8 1.20e8 ForSale For Sa… 1681982 https… TRUE https:…
## 2 21 62 8.20e7 8.20e7 ForSale For Sa… 1682235 https… TRUE https:…
## 3 38 79 9.46e7 9.46e7 ForSale For Sa… 1672547 https… TRUE https:…
## 4 55 96 8.19e7 8.19e7 ForSale For Sa… 1701687 https… TRUE https:…
## 5 64 105 2.20e8 2.20e8 ForSale For Sa… 1696777 https… TRUE https:…
## 6 84 125 7.25e7 7.25e7 ForSale For Sa… 1676462 https… TRUE https:…
## 7 91 132 7.25e7 7.25e7 ForSale For Sa… 1689159 https… TRUE https:…
## 8 92 133 7.25e7 7.25e7 ForSale For Sa… 1695364 https… TRUE https:…
## 9 95 136 1.22e8 1.22e8 ForSale For Sa… 1628931 https… TRUE https:…
## 10 110 151 7.25e7 7.25e7 ForSale For Sa… 1697649 https… TRUE https:…
## # … with 1,358 more rows, 65 more variables: statusType <chr>,
## # statusText <chr>, countryCurrency <chr>, price <dbl>,
## # unformattedPrice <dbl>, address <chr>, addressStreet <chr>,
## # addressCity <chr>, addressState <chr>, addressZipcode <dbl>,
## # isUndisclosedAddress <lgl>, beds <dbl>, baths <dbl>, area <dbl>,
## # latLong <chr>, isZillowOwned <lgl>, variableData <chr>, hdpData <chr>,
## # isSaved <lgl>, isUserClaimingOwner <lgl>, isUserConfirmedClaim <lgl>, …
I’m not really interested in this, so this won’t go very deep, but I thought it would be an interesting to give it a shoot and see what comes out…
How beds, bathrooms, sqft, condo vs. co-op and neighborhood affect sales price
lm_fit <- lm(formula = price ~ beds + baths + neighborhood + sqft + building_type_simple, data = sales_data_annotated)
summary(lm_fit)
##
## Call:
## lm(formula = price ~ beds + baths + neighborhood + sqft + building_type_simple,
## data = sales_data_annotated)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13260409 -966781 68449 739387 31161632
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -2505149.1 461381.9 -5.430
## beds -265270.9 118452.5 -2.239
## baths 2056050.5 133410.2 15.411
## neighborhoodChelsea and Clinton 1064340.1 454226.4 2.343
## neighborhoodEast Harlem 986360.2 736402.2 1.339
## neighborhoodGramercy Park and Murray Hill 720320.9 453687.4 1.588
## neighborhoodGreenwich Village and Soho 1453343.3 516649.0 2.813
## neighborhoodInwood and Washington Heights 899976.3 609754.8 1.476
## neighborhoodLower East Side 1053756.9 520900.9 2.023
## neighborhoodLower Manhattan 428866.3 490020.9 0.875
## neighborhoodUpper East Side 279365.2 459893.7 0.607
## neighborhoodUpper West Side 1174989.4 469028.4 2.505
## neighborhoodWest Side 190457.7 651111.0 0.293
## sqft 1191.5 110.1 10.823
## building_type_simpleCONDO -684848.3 173475.0 -3.948
## Pr(>|t|)
## (Intercept) 6.69e-08 ***
## beds 0.02529 *
## baths < 2e-16 ***
## neighborhoodChelsea and Clinton 0.01926 *
## neighborhoodEast Harlem 0.18066
## neighborhoodGramercy Park and Murray Hill 0.11259
## neighborhoodGreenwich Village and Soho 0.00498 **
## neighborhoodInwood and Washington Heights 0.14019
## neighborhoodLower East Side 0.04328 *
## neighborhoodLower Manhattan 0.38162
## neighborhoodUpper East Side 0.54365
## neighborhoodUpper West Side 0.01236 *
## neighborhoodWest Side 0.76994
## sqft < 2e-16 ***
## building_type_simpleCONDO 8.29e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2562000 on 1344 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.5991, Adjusted R-squared: 0.5949
## F-statistic: 143.4 on 14 and 1344 DF, p-value: < 2.2e-16
0.59 R^2….its something, but its not great! Interesting baths is more significant than beds (which I’m certain are correlated, but baths probably conveys more information)
Now lets look at rental data
rental_data$sqft <- rental_data$area
lm_fit <- lm(formula = price ~ beds + baths + neighborhood + sqft, data = rental_data)
summary(lm_fit)
##
## Call:
## lm(formula = price ~ beds + baths + neighborhood + sqft, data = rental_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33194 -998 -63 862 57439
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -4934.1482 553.0199 -8.922
## beds -893.3074 121.2913 -7.365
## baths 1496.1275 190.7674 7.843
## neighborhoodChelsea and Clinton 4142.8502 565.2244 7.330
## neighborhoodEast Harlem 1280.7189 729.3401 1.756
## neighborhoodGramercy Park and Murray Hill 3510.6025 561.4830 6.252
## neighborhoodGreenwich Village and Soho 7476.6158 668.1046 11.191
## neighborhoodInwood and Washington Heights 347.2018 757.4597 0.458
## neighborhoodLower East Side 3796.5844 644.1673 5.894
## neighborhoodLower Manhattan 3187.5096 572.4817 5.568
## neighborhoodTribeca 4779.4332 2684.6282 1.780
## neighborhoodUpper East Side 3900.3133 607.8712 6.416
## neighborhoodUpper West Side 4170.0659 562.3841 7.415
## neighborhoodWest Side 4096.6281 736.3199 5.564
## sqft 7.4615 0.2012 37.086
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## beds 2.51e-13 ***
## baths 6.88e-15 ***
## neighborhoodChelsea and Clinton 3.25e-13 ***
## neighborhoodEast Harlem 0.0792 .
## neighborhoodGramercy Park and Murray Hill 4.86e-10 ***
## neighborhoodGreenwich Village and Soho < 2e-16 ***
## neighborhoodInwood and Washington Heights 0.6467
## neighborhoodLower East Side 4.37e-09 ***
## neighborhoodLower Manhattan 2.90e-08 ***
## neighborhoodTribeca 0.0752 .
## neighborhoodUpper East Side 1.71e-10 ***
## neighborhoodUpper West Side 1.74e-13 ***
## neighborhoodWest Side 2.97e-08 ***
## sqft < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3720 on 2156 degrees of freedom
## (4103 observations deleted due to missingness)
## Multiple R-squared: 0.7691, Adjusted R-squared: 0.7676
## F-statistic: 512.9 on 14 and 2156 DF, p-value: < 2.2e-16
Wow, 0.76 R^2, that’s not bad!
We actually have in rental data the “zestimate” which is Zillow’s predictions, lets see how correlated this compares against that
predicted_values <- predict(lm_fit, newdata = sales_data)
sales_data$predicted_rent <- predicted_values
sales_data %>% select(zestimate, predicted_rent) %>% drop_na() %>% cor()
## zestimate predicted_rent
## zestimate 1.0000000 0.5564566
## predicted_rent 0.5564566 1.0000000
0.55, so not that great! Zillow’s estimate is pretty different. I saw some negative values in there too, which is a clear sign of not being in the same range and bad extrapolation!
# Tally neighborhoods
neighborhood_counts_rentals <- data.frame(table(rental_data$neighborhood))
neighborhood_counts_sales <- data.frame(table(sales_data$neighborhood))
Is the subset of data generally representative of the total sales data (is it skewed towards some neighborhoods over others)?
neighborhood_counts_sales_annotated <- data.frame(table(sales_data_annotated$neighborhood))
neighborhood_counts_sales_annotated %>% arrange(desc(Freq))
## Var1 Freq
## 1 Gramercy Park and Murray Hill 312
## 2 Chelsea and Clinton 261
## 3 Upper East Side 245
## 4 Upper West Side 168
## 5 Lower Manhattan 110
## 6 Greenwich Village and Soho 78
## 7 Lower East Side 73
## 8 Central Harlem 37
## 9 Inwood and Washington Heights 37
## 10 West Side 28
## 11 East Harlem 19
Compare the ratio of frequency for each neighborhood, in a perfectly representative subset, the ratio would be exactly the same for each. Here they’re generally between 4-6. I would say the general order is relatively the same and the ratio is pretty consistent, so I think its generally well sampled in terms of neighborhoods.
neighborhood_counts_sales %>% left_join(neighborhood_counts_sales_annotated, by = c("Var1"="Var1")) %>% mutate(ratio = `Freq.x`/`Freq.y`)
## Var1 Freq.x Freq.y ratio
## 1 Central Harlem 347 37 9.378378
## 2 Chelsea and Clinton 1127 261 4.318008
## 3 East Harlem 122 19 6.421053
## 4 Gramercy Park and Murray Hill 1344 312 4.307692
## 5 Greenwich Village and Soho 463 78 5.935897
## 6 Inwood and Washington Heights 253 37 6.837838
## 7 Lower East Side 449 73 6.150685
## 8 Lower Manhattan 443 110 4.027273
## 9 Tribeca 23 NA NA
## 10 Upper East Side 1479 245 6.036735
## 11 Upper West Side 881 168 5.244048
## 12 West Side 53 28 1.892857
Compare ratio of rental to sales properties by neighborhood
neighborhood_counts_rentals$num_rental_properties <- neighborhood_counts_rentals$Freq
neighborhood_counts_rentals$Freq <- NULL
neighborhood_counts_sales$num_sales_properties <- neighborhood_counts_sales$Freq
neighborhood_counts_sales$Freq <- NULL
neighborhood_counts <- neighborhood_counts_rentals %>% left_join(neighborhood_counts_sales, by = c("Var1"="Var1"))
neighborhood_counts %<>% mutate(ratio = num_rental_properties/num_sales_properties)
neighborhood_counts
## Var1 num_rental_properties num_sales_properties
## 1 Central Harlem 229 347
## 2 Chelsea and Clinton 1337 1127
## 3 East Harlem 232 122
## 4 Gramercy Park and Murray Hill 929 1344
## 5 Greenwich Village and Soho 342 463
## 6 Inwood and Washington Heights 256 253
## 7 Lower East Side 628 449
## 8 Lower Manhattan 559 443
## 9 Tribeca 10 23
## 10 Upper East Side 637 1479
## 11 Upper West Side 1045 881
## 12 West Side 70 53
## ratio
## 1 0.6599424
## 2 1.1863354
## 3 1.9016393
## 4 0.6912202
## 5 0.7386609
## 6 1.0118577
## 7 1.3986637
## 8 1.2618510
## 9 0.4347826
## 10 0.4306964
## 11 1.1861521
## 12 1.3207547
Assuming that the buyer is taking out an 80% mortgage, what is the monthly cost of owning an apartment vs. renting one in Manhattan?
Interestingly rent is higher for studios, then otherwise owning is more expensive.
sales_data_annotated %>% mutate(type = "sale") %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales") + scale_y_log10() + facet_wrap(~beds) +
geom_signif(comparisons = list(c("rental", "sale")),
map_signif_level = TRUE,
textsize = 3,
tip_length = 0.02,
step_increase = 0.1,
y_position = 3)
Although interesting overall, neighborhood also plays apart, so we should look at that:
sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 0) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 0) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for Studio Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
geom_signif(comparisons = list(c("rental", "sale")),
map_signif_level = TRUE,
textsize = 3,
tip_length = 0.02,
step_increase = 0.1,
y_position = 4)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
sales_data_annotated %>% mutate(type = case_when(
building_type_simple == "CONDO" ~ "sale_condo",
building_type_simple == "CO-OP" ~ "sale_coop"
)) %>% filter(beds == 0) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 0) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for Studio Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
geom_signif(comparisons = list(c("rental", "sale_condo"), c("rental", "sale_coop")),
map_signif_level = TRUE,
textsize = 3,
tip_length = 0.02,
step_increase = 0.1,
y_position = 3)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
##### 1 Bedroom
sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 1) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 1) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 1 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
geom_signif(comparisons = list(c("rental", "sale")),
map_signif_level = TRUE,
textsize = 3,
tip_length = 0.02,
step_increase = 0.1,
y_position = 4)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
sales_data_annotated %>% mutate(type = case_when(
building_type_simple == "CONDO" ~ "sale_condo",
building_type_simple == "CO-OP" ~ "sale_coop"
)) %>% filter(beds == 1) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 1) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 1 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
geom_signif(comparisons = list(c("rental", "sale_condo"), c("rental", "sale_coop")),
map_signif_level = TRUE,
textsize = 3,
tip_length = 0.02,
step_increase = 0.1,
y_position = 3)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
##### 2 Bedroom
sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 2) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 2) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 2 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
geom_signif(comparisons = list(c("rental", "sale")),
map_signif_level = TRUE,
textsize = 3,
tip_length = 0.02,
step_increase = 0.1,
y_position = 4)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
sales_data_annotated %>% mutate(type = case_when(
building_type_simple == "CONDO" ~ "sale_condo",
building_type_simple == "CO-OP" ~ "sale_coop"
)) %>% filter(beds == 2) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 2) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 2 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
geom_signif(comparisons = list(c("rental", "sale_condo"), c("rental", "sale_coop")),
map_signif_level = TRUE,
textsize = 3,
tip_length = 0.02,
step_increase = 0.1,
y_position = 3)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 3) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 3) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 3 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
geom_signif(comparisons = list(c("rental", "sale")),
map_signif_level = TRUE,
textsize = 3,
tip_length = 0.02,
step_increase = 0.1,
y_position = 4)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
sales_data_annotated %>% mutate(type = case_when(
building_type_simple == "CONDO" ~ "sale_condo",
building_type_simple == "CO-OP" ~ "sale_coop"
)) %>% filter(beds == 3) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 3) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 3 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
geom_signif(comparisons = list(c("rental", "sale_condo"), c("rental", "sale_coop")),
map_signif_level = TRUE,
textsize = 3,
tip_length = 0.02,
step_increase = 0.1,
y_position = 3)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
##### 4 Bedroom
sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 4) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 4) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 4 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
geom_signif(comparisons = list(c("rental", "sale")),
map_signif_level = TRUE,
textsize = 3,
tip_length = 0.02,
step_increase = 0.1,
y_position = 4)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
sales_data_annotated %>% mutate(type = case_when(
building_type_simple == "CONDO" ~ "sale_condo",
building_type_simple == "CO-OP" ~ "sale_coop"
)) %>% filter(beds == 4) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 4) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 4 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
geom_signif(comparisons = list(c("rental", "sale_condo"), c("rental", "sale_coop")),
map_signif_level = TRUE,
textsize = 3,
tip_length = 0.02,
step_increase = 0.1,
y_position = 3)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
Generally, it seems that as bedrooms increase, the difference in monthly payment between rentals and sales increase, there also seem to be some strong neighborhood effects. Interestingly, it seems like there isn’t a significant difference in overall monthly cost for large swaths of apartments, particularly in the 0 and 1 bedroom ranges.
test_data <- sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 0) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 0) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type))
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "sale") %>% pull(monthly_price_sqft) %>% median() / test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "rental") %>% pull(monthly_price_sqft) %>% median()
## [1] 1.042845
The average monthly cost of a studio in Manhattan is only 5% higher than a Rental (4% median)
test_data <- sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 1) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 1) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type))
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "sale") %>% pull(monthly_price_sqft) %>% median() / test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "rental") %>% pull(monthly_price_sqft) %>% median()
## [1] 1.08446
10% higher for 1 bedrooms (mean), (8% median)
test_data <- sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 2) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 2) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type))
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "sale") %>% pull(monthly_price_sqft) %>% median() / test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "rental") %>% pull(monthly_price_sqft) %>% median()
## [1] 1.288779
29% higher for 2 bedrooms (29% median)
test_data <- sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 3) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 3) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type))
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "sale") %>% pull(monthly_price_sqft) %>% median() / test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "rental") %>% pull(monthly_price_sqft) %>% median()
## [1] 1.46875
57% for 3 bedrooms (47% median)
test_data <- sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 4) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 4) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type))
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "sale") %>% pull(monthly_price_sqft) %>% median() / test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "rental") %>% pull(monthly_price_sqft) %>% median()
## [1] 1.659079
111% for 4 bedrooms (66% median)
There seems to be almost double the amount of condos to co-ops listed in this subset
building_type_counts <- data.frame(table(sales_data_annotated$building_type_simple))
building_type_counts
## Var1 Freq
## 1 CO-OP 487
## 2 CONDO 881
Wherever there is data comparing CO-OPs and CONDOs, CO-OPs have much higher HOA fees.
# Define a custom y position for the p-value labels
custom_label_y <- exp(mean(log(sales_data_annotated$hoa_fee), na.rm = TRUE))
# Assuming `sales_data_annotated` is your data frame
sales_data_annotated %>%
filter(beds < 6) %>%
ggplot(aes(x = neighborhood, y = hoa_fee, color = building_type_simple)) +
geom_boxplot(outlier.shape = NA) +
theme_bw() +
xlab("Neighborhood") +
ylab("HOA Fees") +
theme(legend.position = "right",
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
ggtitle("HOA Fees by # Beds and Neighborhood") +
scale_y_log10() +
facet_wrap(~beds)
How does the price per squarefoot compare between Co-Ops and Condos?
sales_data_annotated %>% filter(sqft > 0) %>% mutate(monthly_payment_per_sqft = price/sqft) %>% ggplot(aes(x = building_type_simple, y = monthly_payment_per_sqft, color = building_type_simple)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(alpha = 0.2) +
theme_bw() +
xlab("Neighborhood") +
ylab("Price per Sqft") +
theme(legend.position = "right",
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
ggtitle("Price/Sqft of Co-Op vs. Condo") +
stat_compare_means(method = "t.test", label = "p.signif", comparisons = list(c("CO-OP", "CONDO")))
## [1] FALSE
price_per_sqft_data <- sales_data_annotated %>%
filter(sqft > 0) %>%
mutate(monthly_payment_per_sqft = price / sqft)
# Calculate the mean price per square foot for each building type
mean_price_per_sqft <- price_per_sqft_data %>%
group_by(building_type_simple) %>%
summarise(mean_price_per_sqft = mean(monthly_payment_per_sqft, na.rm = TRUE))
mean_price_per_sqft
## # A tibble: 2 × 2
## building_type_simple mean_price_per_sqft
## <chr> <dbl>
## 1 CO-OP 1109.
## 2 CONDO 1711.
1711.269/1109.424 = 1.54 (about 50% more expensive per sqft on average)
sales_data_annotated %>% filter(building_type_simple == "CONDO", neighborhood == "Upper East Side", beds == 1) %>% pull(historical_cagr) %>% median()
## [1] 0.04784556
When I looked it up, I saw the CAGR for the S&P500 is 9.27%, so I’m going to use that as a benchmark to compare against the CAGR for these listings
sales_data_annotated %>% ggplot(aes(x = neighborhood, y = historical_cagr, color = building_type_simple)) + geom_boxplot(outlier.shape =NA) + theme_bw() + xlab("Neighborhood") + ylab("Historical CAGR") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("All Units") + scale_y_log10() + geom_hline(yintercept = 0.0927, linetype = "dashed", color = "red")
The baseline historical CAGR for these apartments seems to be below the
S&P 500 (but you can actually multiply it using a mortgage, which
you can’t really do with others)
Facet By Beds (1-4)
sales_data_annotated %>% filter(beds < 6) %>% ggplot(aes(x = neighborhood, y = historical_cagr, color = building_type_simple)) + geom_boxplot(outlier.shape =NA)+ theme_bw() + xlab("Neighborhood") + ylab("Historical CAGR") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Historic Returns by # Beds and Neighborhood") + scale_y_log10() + geom_hline(yintercept = 0.0927, linetype = "dashed", color = "red") + facet_wrap(~beds)
Ideally, we want to identify a neighborhood x apartment type that has high historic returns with lower volatility, and many data points so that we have higher confidence in the data
While 4 bedroom condos in Lower Manhattan have the best overall profile (9% median returns with .2% SD, there are only 2 points so its not reliable)
# n = 25 is arbitrary
sales_data_annotated %>%
group_by(beds, building_type_simple, neighborhood) %>% filter(n() > 25) %>%
summarise(
median_cagr = median(historical_cagr, na.rm = TRUE),
std_dev = sd(historical_cagr, na.rm = TRUE),
count = n(),
ratio = median(historical_cagr, na.rm = TRUE)/sd(historical_cagr, na.rm = TRUE)
) %>% arrange(desc(ratio))
## `summarise()` has grouped output by 'beds', 'building_type_simple'. You can
## override using the `.groups` argument.
## # A tibble: 17 × 7
## # Groups: beds, building_type_simple [6]
## beds building_type_simple neighborhood media…¹ std_dev count ratio
## <dbl> <chr> <chr> <dbl> <dbl> <int> <dbl>
## 1 1 CONDO Upper East Side 0.0478 0.0383 39 1.25
## 2 1 CONDO Upper West Side 0.0581 0.111 38 0.524
## 3 0 CO-OP Gramercy Park and Mu… 0.0301 0.0657 31 0.458
## 4 1 CONDO Chelsea and Clinton 0.0423 0.109 84 0.388
## 5 2 CONDO Chelsea and Clinton 0.0391 0.136 73 0.287
## 6 1 CONDO Gramercy Park and Mu… 0.0519 0.191 50 0.271
## 7 2 CONDO Upper West Side 0.0576 0.220 30 0.261
## 8 1 CO-OP Gramercy Park and Mu… 0.0217 0.0848 72 0.255
## 9 2 CO-OP Upper East Side 0.0428 0.186 46 0.230
## 10 2 CONDO Gramercy Park and Mu… 0.0521 0.311 57 0.168
## 11 1 CO-OP Upper East Side 0.0317 0.192 47 0.165
## 12 1 CO-OP Chelsea and Clinton 0.0483 0.298 26 0.162
## 13 2 CONDO Upper East Side 0.0436 0.272 43 0.160
## 14 1 CONDO Lower Manhattan 0.0237 0.195 42 0.122
## 15 2 CONDO Lower Manhattan 0.0244 0.211 33 0.116
## 16 3 CONDO Gramercy Park and Mu… 0.0194 0.232 26 0.0834
## 17 2 CO-OP Gramercy Park and Mu… 0.0412 1.76 38 0.0234
## # … with abbreviated variable name ¹median_cagr
1 Bedroom Condos in the upper east side seem to be the best, with ~5% median CAGR with 4% standard deviation and 39 data points.
At the lowest end, you’d need an annual income of at least 137151.5 and $99,0000, but you could own a very good apartment. To afford median monthly payment, you’d want $218k annual salary
Minimum:
listing_data <- sales_data_annotated
# Use the 40X rule to see what minimum income you'd need to afford
listing_data %>% filter(neighborhood == "Upper East Side", building_type_simple=="CONDO", beds == 1) %>% pull(monthly_payment) %>% min() * 40 # $113,151.5
## [1] 113151.5
listing_data %>% filter(neighborhood == "Upper East Side", building_type_simple=="CONDO", beds == 1) %>% pull(downpayment) %>% min() # $99,000
## [1] 99000
Median:
# Use the 40X rule to see what minimum income you'd need to afford
listing_data %>% filter(neighborhood == "Upper East Side", building_type_simple=="CONDO", beds == 1) %>% pull(monthly_payment) %>% median() * 40 # $218,302
## [1] 218302.4
listing_data %>% filter(neighborhood == "Upper East Side", building_type_simple=="CONDO", beds == 1) %>% pull(downpayment) %>% median() # $191,000
## [1] 191000
We want to see what the equity in the property would be after 5 years, and also what the appreciation on it would be like. To compute this, we use the historical CAGR of the property to project the future value and then calculate the equity built after several years by building an amoritization table for each property.
listing_data$remaining_balance <- NULL
listing_data$equity <- NULL
n_years <- 5 # years
n_years_in_months <- n_years*12
listing_data$loan_amount <- listing_data$downpayment / downpayment_percent
# Parameters
annual_interest_rate <- 0.0772 # 2% annual interest
compounding_frequency <- 12 # Monthly compounding
payment_frequency <- 12 # Monthly payments
loan_term_years <- 30
balance_at_target_year <- c()
for (i in seq(length(listing_data$`Unnamed: 0`))) {
# Calculate the amortization table for each property
amortization_table <- amort.table(
Loan = listing_data[i, ] %>% pull(loan_amount),
n = loan_term_years * payment_frequency, # Total number of payments
i = annual_interest_rate,
ic = compounding_frequency,
pf = payment_frequency,
plot = FALSE # Set TRUE if you want to plot payment proportions
)
# extract the data
data.frame(amortization_table$Schedule)[n_years_in_months, ] %>% pull(`Principal.Paid`)
balance_at_target_year <- c(balance_at_target_year, data.frame(amortization_table$Schedule)[n_years_in_months, ] %>% pull(`Balance`))
}
options(scipen = 999)
listing_data$balance_at_target_year <- balance_at_target_year
listing_data$equity_at_target_year <- listing_data$loan_amount - listing_data$balance_at_target_year + listing_data$downpayment
listing_data$balance_paid_off_at_target_year <- listing_data$loan_amount - listing_data$balance_at_target_year
# this is purely appreciation on the initial down payment (not considering equity)
listing_data$est_roi_on_downpayment <- (listing_data$historical_cagr*n_years*listing_data$unformattedPrice)/listing_data$downpayment
listing_data$est_roi_vs_benchmark <- (listing_data$est_roi_on_downpayment - 0.0927*n_years)/n_years
listing_data
## # A tibble: 1,368 × 81
## ...1 Unnamed:…¹ zpid id rawHo…² marke…³ provi…⁴ imgSrc hasIm…⁵ detai…⁶
## <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <chr> <lgl> <chr>
## 1 3 44 1.20e8 1.20e8 ForSale For Sa… 1681982 https… TRUE https:…
## 2 21 62 8.20e7 8.20e7 ForSale For Sa… 1682235 https… TRUE https:…
## 3 38 79 9.46e7 9.46e7 ForSale For Sa… 1672547 https… TRUE https:…
## 4 55 96 8.19e7 8.19e7 ForSale For Sa… 1701687 https… TRUE https:…
## 5 64 105 2.20e8 2.20e8 ForSale For Sa… 1696777 https… TRUE https:…
## 6 84 125 7.25e7 7.25e7 ForSale For Sa… 1676462 https… TRUE https:…
## 7 91 132 7.25e7 7.25e7 ForSale For Sa… 1689159 https… TRUE https:…
## 8 92 133 7.25e7 7.25e7 ForSale For Sa… 1695364 https… TRUE https:…
## 9 95 136 1.22e8 1.22e8 ForSale For Sa… 1628931 https… TRUE https:…
## 10 110 151 7.25e7 7.25e7 ForSale For Sa… 1697649 https… TRUE https:…
## # … with 1,358 more rows, 71 more variables: statusType <chr>,
## # statusText <chr>, countryCurrency <chr>, price <dbl>,
## # unformattedPrice <dbl>, address <chr>, addressStreet <chr>,
## # addressCity <chr>, addressState <chr>, addressZipcode <dbl>,
## # isUndisclosedAddress <lgl>, beds <dbl>, baths <dbl>, area <dbl>,
## # latLong <chr>, isZillowOwned <lgl>, variableData <chr>, hdpData <chr>,
## # isSaved <lgl>, isUserClaimingOwner <lgl>, isUserConfirmedClaim <lgl>, …
We look at all the listings and their historical CAGR to see how well they would perform against the S&P 500, generally most (if financed) would yield a greater ROI on the down payment, but have high volatility
# Calculate the IQR and determine bounds for outlier removal
listing_data_2 <- listing_data %>% filter(beds < 4) %>%
group_by(neighborhood, building_type_simple) %>%
mutate(Q1 = quantile(est_roi_vs_benchmark, 0.25),
Q3 = quantile(est_roi_vs_benchmark, 0.75)) %>%
mutate(IQR = Q3 - Q1,
Lower_Bound = Q1 - 1.5 * IQR,
Upper_Bound = Q3 + 1.5 * IQR) %>%
filter(est_roi_vs_benchmark >= Lower_Bound & est_roi_vs_benchmark <= Upper_Bound) %>%
ungroup()
# only look at neighborhoods with data > 50 (higher certainty)
listing_data_2 %<>% group_by(neighborhood) %>% filter(n() > 50)
# Plotting the filtered data without outliers
ggplot(listing_data_2, aes(x = neighborhood, y = est_roi_vs_benchmark, color = building_type_simple)) +
geom_boxplot() + # Now the boxplot will naturally have no outliers as they were filtered out
geom_jitter(alpha = 0.4) + # Adds a jitter plot to display all data points
theme_bw() + # Applies a minimal theme
xlab("Neighborhood") +
ylab("Excess CAGR vs. Benchmark") +
theme(legend.position = "right",
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggtitle("All Units") + geom_hline(yintercept = 0, linetype = "dashed", color = "red") + facet_wrap(~beds) +
geom_signif(comparisons = list(c("CO-OP", "CONDO")),
map_signif_level = TRUE,
textsize = 3,
tip_length = 0.02,
step_increase = 0.1,
y_position = 0)
listing_data %>% filter(est_roi_vs_benchmark > 0) %>% pull(est_roi_vs_benchmark) %>% length()
## [1] 950
listing_data %>% filter(est_roi_vs_benchmark < 0) %>% pull(est_roi_vs_benchmark) %>% length()
## [1] 418
After 5 years, 418 did not beat benchmark, 950 did (69% of total)
Studio: 71.6% 1 Bedroom: 70.0% 2 Bedroom: 72.4% 3 Bedroom: 61.3% 4 Bedroom: 59.2%
bed_num = 4
listing_data %>% filter(beds == bed_num) %>% filter(est_roi_vs_benchmark > 0) %>% pull(est_roi_vs_benchmark) %>% length() / (listing_data %>% filter(beds == bed_num) %>% pull(est_roi_vs_benchmark)) %>% length()
## [1] 0.5918367
# Try compare hoa fee with cagr directly (no significant relationship)
lm_fit <- lm(formula = historical_cagr ~ hoa_fee, data = listing_data)
summary(lm_fit)
##
## Call:
## lm(formula = historical_cagr ~ hoa_fee, data = listing_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0850 -0.0730 -0.0458 -0.0094 30.9711
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.087925088 0.035617787 2.469 0.0137 *
## hoa_fee -0.000001456 0.000011366 -0.128 0.8981
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9111 on 1360 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 1.206e-05, Adjusted R-squared: -0.0007232
## F-statistic: 0.0164 on 1 and 1360 DF, p-value: 0.8981
# Try to normalize by taking hoa fee per sqft (no significant linear relationship)
lm_fit <- lm(formula = historical_cagr ~ hoa_fee_per_sqft, data = listing_data %>% filter(sqft > 0) %>% mutate(hoa_fee_per_sqft = hoa_fee/sqft))
summary(lm_fit)
##
## Call:
## lm(formula = historical_cagr ~ hoa_fee_per_sqft, data = listing_data %>%
## filter(sqft > 0) %>% mutate(hoa_fee_per_sqft = hoa_fee/sqft))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.04916 -0.04221 -0.01268 0.02683 1.91377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.044416 0.016636 2.670 0.00773 **
## hoa_fee_per_sqft 0.005559 0.009378 0.593 0.55351
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2146 on 891 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.0003942, Adjusted R-squared: -0.0007277
## F-statistic: 0.3513 on 1 and 891 DF, p-value: 0.5535
Plot to visually inspect
listing_data %>% ggplot(aes(x = hoa_fee, y = historical_cagr)) + geom_point() + theme_bw() + scale_y_log10() + scale_x_log10() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'